home *** CD-ROM | disk | FTP | other *** search
/ InterCD 2001 May / may_2001.iso / intercd / root / Multimedia / ^DivX_Article / virtualdub / VirtualDub-source-1_4d / MJPEGDecoder.cpp < prev    next >
Encoding:
C/C++ Source or Header  |  2001-03-17  |  33.2 KB  |  1,347 lines

  1. //    VirtualDub - Video processing and capture application
  2. //    Copyright (C) 1998-2000 Avery Lee
  3. //
  4. //    This program is free software; you can redistribute it and/or modify
  5. //    it under the terms of the GNU General Public License as published by
  6. //    the Free Software Foundation; either version 2 of the License, or
  7. //    (at your option) any later version.
  8. //
  9. //    This program is distributed in the hope that it will be useful,
  10. //    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12. //    GNU General Public License for more details.
  13. //
  14. //    You should have received a copy of the GNU General Public License
  15. //    along with this program; if not, write to the Free Software
  16. //    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17.  
  18. #include <stdio.h>
  19. #include <stdlib.h>
  20. #include <io.h>
  21. #include <fcntl.h>
  22. #include <math.h>
  23. #include <crtdbg.h>
  24. #include <windows.h>
  25. #include <conio.h>
  26.  
  27. #include "MJPEGDecoder.h"
  28.  
  29. //#define DCTLEN_PROFILE
  30. //#define PROFILE
  31.  
  32.  
  33.  
// Optional profiling counters, compiled in only when DCTLEN_PROFILE is
// defined (the #define near the top of the file is commented out).  They are
// given C linkage; nothing in this part of the file updates them, so they are
// presumably maintained by the external assembly routines -- TODO confirm.
#ifdef DCTLEN_PROFILE
extern "C" {
    long short_coeffs, med_coeffs, long_coeffs;
};
#endif
  39.  
  40. ///////////////////////////////////////////////////////////////////////////
  41. //
  42. //        Externs
  43. //
  44. ///////////////////////////////////////////////////////////////////////////
  45.  
  46. typedef unsigned char byte;
  47. typedef unsigned long dword;
  48.  
// Decoding parameters for one 8x8 block position inside a batch of MCUs.
// decodeScan() fills in an array of these and decodeMCUs() hands that array
// to the external asm_mb_decode() routine.
//
// NOTE(review): because the array is consumed by assembly code that is not
// part of this file, the member order and sizes presumably must not change
// without updating that code -- confirm before editing.
class MJPEGBlockDef {
public:
    const byte *huff_dc;                // DC Huffman table selected for this block (entry of huff_dc[])
    const byte *huff_ac;                // AC table for the long codes (entry of huff_ac[])
    const byte (*huff_ac_quick)[2];     // AC lookup for the shortest codes (entry of huff_ac_quick[])
    const byte (*huff_ac_quick2)[2];    // AC lookup built by the MJPEGDecoder constructor (entry of huff_ac_quick2[])
    const int *quant;                   // quantization matrix of the block's component (row of MJPEGDecoder::quant)
    int *dc_ptr;                        // points at the component's entry in MJPEGDecoder::comp_last_dc
    int    ac_last;                     // never written by the C++ code here; decodeMCUs() reads it after
                                        // asm_mb_decode() and passes it to IDCT_mmx() -- presumably set by the asm
};
// Entropy decoder implemented outside this file (C linkage).  decodeMCUs()
// calls it with the bit buffer state, the input pointer, the number of blocks
// to decode (mcu_length * MCUs in the batch), the block definitions and one
// coefficient buffer pointer per block.  bitbuf, bitcnt and ptr are passed by
// reference, so the routine can advance them.
extern "C" void asm_mb_decode(dword& bitbuf, int& bitcnt, byte *& ptr, int mcu_length, MJPEGBlockDef *pmbd, short **dctarray);

// Inverse DCT implemented outside this file (C linkage; MMX, going by its
// name).  decodeMCUs() calls it in place (dst == dct_coeff) with pitch 16,
// intra_flag 2 and the block's ac_last value.
extern "C" void IDCT_mmx(signed short *dct_coeff, void *dst, long pitch, int intra_flag, int ac_last);
  62.  
  63. ///////////////////////////////////////////////////////////////////////////
  64. //
  65. //        Tables
  66. //
  67. ///////////////////////////////////////////////////////////////////////////
  68.  
// Maps a coefficient's position in the zigzag-ordered stream (array index)
// to its position in the 8x8 block in row-major order (row*8 + column).
// decodeQuantTables() uses it to compute the per-coefficient offsets stored
// next to each quantizer value.
static const char MJPEG_zigzag[64] = {        // the reverse zigzag scan order
         0,  1,  8, 16,  9,  2,  3, 10,
        17, 24, 32, 25, 18, 11,  4,  5,
        12, 19, 26, 33, 40, 48, 41, 34,
        27, 20, 13,  6,  7, 14, 21, 28,
        35, 42, 49, 56, 57, 50, 43, 36,
        29, 22, 15, 23, 30, 37, 44, 51,
        58, 59, 52, 45, 38, 31, 39, 46,
        53, 60, 61, 54, 47, 55, 62, 63,
};
  79.  
  80. // Huffman tables
  81.  
// DC Huffman table 0 in the same layout as a JPEG DHT segment body:
// 16 bytes giving the number of codes of each length 1..16, followed by the
// symbol values in code order (here the twelve DC size categories 0..11).
static const byte huff_dc_0[] = {    // DC table 0
    0x00,0x01,0x05,0x01,0x01,0x01,0x01,0x01,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,    // counts by bit length
    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,    // values
};
  86.  
// DC Huffman table 1, same layout as huff_dc_0: 16 per-length code counts
// (lengths 1..16) followed by the twelve symbol values in code order.
static const byte huff_dc_1[] = {    // DC table 1
    0x00,0x03,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x00,0x00,0x00,0x00,0x00,
    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,
};
  91.  
// Fast lookup for the shortest codes of AC table 0.  Each entry is a pair of
// bytes; the row labels (00-0F, 10-1F, ...) are byte offsets into the table,
// eight two-byte entries per row, 88 entries in total.
//
// The entries line up with indexing by the top 7 bits of the next 16 bits of
// input for windows below 0xB000 (0xB000 >> 9 == 88).  The first byte matches
// the already sign-extended coefficient value for a zero-run code (0x00 on
// the end-of-block rows) and the second byte the total number of bits
// consumed (Huffman code plus magnitude bits).  The consumer is the external
// assembly decoder, so treat this reading as an assumption -- TODO confirm
// against asm_mb_decode.
static const byte huff_ac_0_quick[][2]={
#if 0
    0x01,2,    // 0000-3FFF
    0x01,2,
    0x01,2,
    0x01,2,
    0x02,2,    // 4000-7FFF
    0x02,2,
    0x02,2,
    0x02,2,
    0x03,3,    // 8000-9FFF
    0x03,3,
    0x00,4,    // A000-AFFF
#endif

/* 00-0F */ 0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,
/* 10-1F */ 0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,
/* 20-2F */ 0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,
/* 30-3F */ 0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,
/* 40-4F */ 0xFD,4,0xFD,4,0xFD,4,0xFD,4,0xFD,4,0xFD,4,0xFD,4,0xFD,4,
/* 50-5F */ 0xFE,4,0xFE,4,0xFE,4,0xFE,4,0xFE,4,0xFE,4,0xFE,4,0xFE,4,
/* 60-6F */ 0x02,4,0x02,4,0x02,4,0x02,4,0x02,4,0x02,4,0x02,4,0x02,4,
/* 70-7F */    0x03,4,0x03,4,0x03,4,0x03,4,0x03,4,0x03,4,0x03,4,0x03,4,
/* 80-8F */ 0xF9,6,0xF9,6,0xFA,6,0xFA,6,0xFB,6,0xFB,6,0xFC,6,0xFC,6,
/* 90-9F */ 0x04,6,0x04,6,0x05,6,0x05,6,0x06,6,0x06,6,0x07,6,0x07,6,
/* A0-AF */    0x00,4,0x00,4,0x00,4,0x00,4,0x00,4,0x00,4,0x00,4,0x00,4,
};
  119.  
// Second-level lookup for AC table 0, filled in by the MJPEGDecoder
// constructor from huff_ac_0_src.  One {symbol, code length} pair per 12-bit
// code prefix in the range 0xB00..0xFFF; prefixes not covered by a code of
// 12 bits or fewer keep their static zero initialisation.
static byte huff_ac_0_quick2[0x1000 - 0xB00][2];
  121.  
// Long-code part of AC table 0, stored as {symbol, code length} pairs.
// Only the codes longer than 12 bits appear here: the single 15-bit code
// (listed twice) followed by the 125 16-bit codes, which matches the last
// two counts (0x01, 0x7D) of the count row kept in huff_ac_0_src.  The
// duplication suggests one entry per 16-bit code slot -- TODO confirm
// against the assembly decoder that reads this table.
//
// The commented-out data below is the standard count row and the short-code
// symbols; the live copy of that data is huff_ac_0_src.
static const byte huff_ac_0[]={        // AC table 0
//    0x00,0x02,0x01,0x03,0x03,0x02,0x04,0x03,0x05,0x05,0x04,0x04,0x00,0x00,0x01,0x7D,    // 0xe2 values

/*
    0x01,0x02,                    // (00-01) 0000-7FFF
    0x03,                        // (02)    8000-9FFF
    0x00,0x04,0x11,                // (03-05) A000-CFFF
    0x05,0x12,0x21,                // (06-08) D000-E7FF
    0x31,0x41,                    // (09-0A) E800-EFFF
    0x06,0x13,0x51,0x61,        // (0B-0E) F000-F7FF
    0x07,0x22,0x71,                // (0F-11) F800-FAFF
    0x14,0x32,0x81,0x91,0xA1,    // (12-16) FB00-FD7F
    0x08,0x23,0x42,0xB1,0xC1,    // (17-1B) FD80-FEBF
    0x15,0x52,0xD1,0xF0,        // (1C-1F) FEC0-FF3F
    0x24,0x33,0x62,0x72,        // (20-23) FF40-FF7F
*/
    0x82,15,
    0x82,15,

    0x09,16,0x0A,16,0x16,16,0x17,16,0x18,16,0x19,16,0x1A,16,0x25,16,0x26,16,0x27,16,0x28,16,0x29,16,0x2A,16,0x34,16,0x35,16,0x36,16,
    0x37,16,0x38,16,0x39,16,0x3A,16,0x43,16,0x44,16,0x45,16,0x46,16,0x47,16,0x48,16,0x49,16,0x4A,16,0x53,16,0x54,16,0x55,16,0x56,16,
    0x57,16,0x58,16,0x59,16,0x5A,16,0x63,16,0x64,16,0x65,16,0x66,16,0x67,16,0x68,16,0x69,16,0x6A,16,0x73,16,0x74,16,0x75,16,0x76,16,
    0x77,16,0x78,16,0x79,16,0x7A,16,0x83,16,0x84,16,0x85,16,0x86,16,0x87,16,0x88,16,0x89,16,0x8A,16,0x92,16,0x93,16,0x94,16,0x95,16,
    0x96,16,0x97,16,0x98,16,0x99,16,0x9A,16,0xA2,16,0xA3,16,0xA4,16,0xA5,16,0xA6,16,0xA7,16,0xA8,16,0xA9,16,0xAA,16,0xB2,16,0xB3,16,
    0xB4,16,0xB5,16,0xB6,16,0xB7,16,0xB8,16,0xB9,16,0xBA,16,0xC2,16,0xC3,16,0xC4,16,0xC5,16,0xC6,16,0xC7,16,0xC8,16,0xC9,16,0xCA,16,
    0xD2,16,0xD3,16,0xD4,16,0xD5,16,0xD6,16,0xD7,16,0xD8,16,0xD9,16,0xDA,16,0xE1,16,0xE2,16,0xE3,16,0xE4,16,0xE5,16,0xE6,16,0xE7,16,
    0xE8,16,0xE9,16,0xEA,16,0xF1,16,0xF2,16,0xF3,16,0xF4,16,0xF5,16,0xF6,16,0xF7,16,0xF8,16,0xF9,16,0xFA,16,
};
  150.  
// Source data from which the MJPEGDecoder constructor builds
// huff_ac_0_quick2.  Layout: 16 per-length code counts (lengths 1..16), then
// the symbols of every code that is 12 bits long or shorter, in code order.
// The first twelve counts add up to 36, which is the number of symbols
// listed; the constructor only walks lengths 1..12, so the symbols of the
// longer codes are not needed here (they live in huff_ac_0).
// Trailing comments give the symbol index range and, going by the values,
// the 16-bit code range for each code length.
static const byte huff_ac_0_src[]={        // AC table 0
    0x00,0x02,0x01,0x03,0x03,0x02,0x04,0x03,0x05,0x05,0x04,0x04,0x00,0x00,0x01,0x7D,    // 0xe2 values

    0x01,0x02,                    // (00-01) 0000-7FFF
    0x03,                        // (02)    8000-9FFF
    0x00,0x04,0x11,                // (03-05) A000-CFFF
    0x05,0x12,0x21,                // (06-08) D000-E7FF
    0x31,0x41,                    // (09-0A) E800-EFFF
    0x06,0x13,0x51,0x61,        // (0B-0E) F000-F7FF
    0x07,0x22,0x71,                // (0F-11) F800-FAFF
    0x14,0x32,0x81,0x91,0xA1,    // (12-16) FB00-FD7F
    0x08,0x23,0x42,0xB1,0xC1,    // (17-1B) FD80-FEBF
    0x15,0x52,0xD1,0xF0,        // (1C-1F) FEC0-FF3F
    0x24,0x33,0x62,0x72,        // (20-23) FF40-FF7F
};
  166.  
// Fast lookup for the shortest codes of AC table 1; same shape as
// huff_ac_0_quick: 88 two-byte entries, row labels are byte offsets, eight
// entries per row.
//
// As with table 0, the entries line up with indexing by the top 7 bits of the
// next 16 input bits, the first byte matching the sign-extended coefficient
// value (0x00 on the end-of-block rows, which for this table is the 2-bit
// code) and the second byte the total bits consumed.  The consumer is the
// external assembly decoder -- TODO confirm against asm_mb_decode.
static const byte huff_ac_1_quick[][2]={
#if 0
    0x00,2,    // 0000-0FFF
    0x00,2,    // 1000-1FFF
    0x00,2,    // 2000-2FFF
    0x00,2,    // 3000-3FFF
    0x01,2,    // 4000-4FFF
    0x01,2,    // 5000-5FFF
    0x01,2,    // 6000-6FFF
    0x01,2,    // 7000-7FFF
    0x02,3,    // 8000-8FFF
    0x02,3,    // 9000-9FFF
    0x03,4,    // A000-AFFF
#endif

/* 00-0F */ 0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,
/* 10-1F */ 0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,
/* 20-2F */ 0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,
/* 30-3F */ 0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,0x00,2,
/* 40-4F */ 0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,
/* 50-5F */ 0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,0xFF,3,
/* 60-6F */ 0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,
/* 70-7F */ 0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,0x01,3,
/* 80-8F */ 0xFD,5,0xFD,5,0xFD,5,0xFD,5,0xFE,5,0xFE,5,0xFE,5,0xFE,5,
/* 90-9F */ 0x02,5,0x02,5,0x02,5,0x02,5,0x03,5,0x03,5,0x03,5,0x03,5,
/* A0-AF */    0xF9,7,0xFA,7,0xFB,7,0xFC,7,0x04,7,0x05,7,0x06,7,0x07,7,
};
  194.  
// Long-code part of AC table 1, stored as {symbol, code length} pairs.
// Only codes longer than 12 bits appear: the 14-bit code (listed four
// times), the two 15-bit codes (twice each) and the 119 16-bit codes, which
// matches the last three counts (0x01, 0x02, 0x77) of the count row kept in
// huff_ac_1_src.  The repetition suggests one entry per 16-bit code slot --
// TODO confirm against the assembly decoder that reads this table.
//
// The commented-out data below is the standard count row and the short-code
// symbols; the live copy of that data is huff_ac_1_src.
static const byte huff_ac_1[]={        // AC table 1
//    0x00,0x02,0x01,0x02,0x04,0x04,0x03,0x04,0x07,0x05,0x04,0x04,0x00,0x01,0x02,0x77,

/*
    0x00,0x01,                                // (00-01) 4000 0000-7FFF
    0x02,                                    // (02)    2000 8000-9FFF
    0x03,0x11,                                // (03-04) 1000 A000-BFFF
    0x04,0x05,0x21,0x31,                    // (05-08) 0800 C000-DFFF
    0x06,0x12,0x41,0x51,                    // (09-0C) 0400 E000-EFFF
    0x07,0x61,0x71,                            // (0D-0F) 0200 F000-F5FF
    0x13,0x22,0x32,0x81,                    // (10-13) 0100 F600-F9FF
    0x08,0x14,0x42,0x91,0xA1,0xB1,0xC1,        // (14-1B) 0080 FA00-FD7F
    0x09,0x23,0x33,0x52,0xF0,                // (1C-20) 0040 FD80-FEBF
    0x15,0x62,0x72,0xD1,                    // (21-24) 0020 FEC0-FF3F
    0x0A,0x16,0x24,0x34,                    // (25-28) 0010 FF40-FF80
*/
    0xE1,14,
    0xE1,14,
    0xE1,14,
    0xE1,14,

    0x25,15,0x25,15,
    0xF1,15,0xF1,15,

    0x17,16,0x18,16,0x19,16,0x1A,16,0x26,16,0x27,16,0x28,16,0x29,16,0x2A,16,0x35,16,0x36,16,0x37,16,0x38,16,0x39,16,0x3A,16,0x43,16,
    0x44,16,0x45,16,0x46,16,0x47,16,0x48,16,0x49,16,0x4A,16,0x53,16,0x54,16,0x55,16,0x56,16,0x57,16,0x58,16,0x59,16,0x5A,16,0x63,16,
    0x64,16,0x65,16,0x66,16,0x67,16,0x68,16,0x69,16,0x6A,16,0x73,16,0x74,16,0x75,16,0x76,16,0x77,16,0x78,16,0x79,16,0x7A,16,0x82,16,
    0x83,16,0x84,16,0x85,16,0x86,16,0x87,16,0x88,16,0x89,16,0x8A,16,0x92,16,0x93,16,0x94,16,0x95,16,0x96,16,0x97,16,0x98,16,0x99,16,
    0x9A,16,0xA2,16,0xA3,16,0xA4,16,0xA5,16,0xA6,16,0xA7,16,0xA8,16,0xA9,16,0xAA,16,0xB2,16,0xB3,16,0xB4,16,0xB5,16,0xB6,16,0xB7,16,
    0xB8,16,0xB9,16,0xBA,16,0xC2,16,0xC3,16,0xC4,16,0xC5,16,0xC6,16,0xC7,16,0xC8,16,0xC9,16,0xCA,16,0xD2,16,0xD3,16,0xD4,16,0xD5,16,
    0xD6,16,0xD7,16,0xD8,16,0xD9,16,0xDA,16,0xE2,16,0xE3,16,0xE4,16,0xE5,16,0xE6,16,0xE7,16,0xE8,16,0xE9,16,0xEA,16,0xF2,16,0xF3,16,
    0xF4,16,0xF5,16,0xF6,16,0xF7,16,0xF8,16,0xF9,16,0xFA,16
};
  228.  
// Source data from which the MJPEGDecoder constructor builds
// huff_ac_1_quick2.  Layout: 16 per-length code counts (lengths 1..16), then
// the symbols of every code that is 12 bits long or shorter, in code order.
// The first twelve counts add up to 40, which is the number of symbols
// listed; the symbols of the longer codes live in huff_ac_1.
// Trailing comments give the symbol index range for each code length
// followed by two hex fields that appear to be the per-code span and the
// 16-bit code range (not re-derived here).  The index ranges from the
// 9-bit row onwards have been corrected to match the seven symbols on
// that row.
static const byte huff_ac_1_src[]={        // AC table 1
    0x00,0x02,0x01,0x02,0x04,0x04,0x03,0x04,0x07,0x05,0x04,0x04,0x00,0x01,0x02,0x77,

    0x00,0x01,                                // (00-01) 4000 0000-7FFF
    0x02,                                    // (02)    2000 8000-9FFF
    0x03,0x11,                                // (03-04) 1000 A000-BFFF
    0x04,0x05,0x21,0x31,                    // (05-08) 0800 C000-DFFF
    0x06,0x12,0x41,0x51,                    // (09-0C) 0400 E000-EFFF
    0x07,0x61,0x71,                            // (0D-0F) 0200 F000-F5FF
    0x13,0x22,0x32,0x81,                    // (10-13) 0100 F600-F9FF
    0x08,0x14,0x42,0x91,0xA1,0xB1,0xC1,        // (14-1A) 0080 FA00-FD7F
    0x09,0x23,0x33,0x52,0xF0,                // (1B-1F) 0040 FD80-FEBF
    0x15,0x62,0x72,0xD1,                    // (20-23) 0020 FEC0-FF3F
    0x0A,0x16,0x24,0x34,                    // (24-27) 0010 FF40-FF80
};
  244.  
// Second-level lookup for AC table 1, filled in by the MJPEGDecoder
// constructor from huff_ac_1_src.  One {symbol, code length} pair per 12-bit
// code prefix in the range 0xB00..0xFFF; prefixes not covered by a code of
// 12 bits or fewer keep their static zero initialisation.
static byte huff_ac_1_quick2[0x1000 - 0xB00][2];
  246.  
// Selector arrays: index 0 or 1 picks the corresponding table.  decodeScan()
// indexes huff_dc, huff_ac, huff_ac_quick and huff_ac_quick2 with the table
// selectors from the scan header; the constructor indexes huff_ac_src and
// huff_ac_quick2 while building the quick2 tables.  Each array has exactly
// two entries.
static const byte *huff_dc[2] = { huff_dc_0, huff_dc_1 };
static const byte *huff_ac[2] = { huff_ac_0, huff_ac_1 };
static const byte *huff_ac_src[2] = { huff_ac_0_src, huff_ac_1_src };
static const byte (*huff_ac_quick[2])[2] = { huff_ac_0_quick, huff_ac_1_quick };
static const byte (*huff_ac_quick2[2])[2] = { huff_ac_0_quick2, huff_ac_1_quick2 };
  252.  
  253. ///////////////////////////////////////////////////////////////////////////
  254. //
  255. //        Class definitions
  256. //
  257. ///////////////////////////////////////////////////////////////////////////
  258.  
  259.  
// Motion-JPEG frame decoder.  Implements IMJPEGDecoder (declared in
// MJPEGDecoder.h) and is created through CreateMJPEGDecoder().
//
// The decoder only accepts baseline, 8-bit, three-component frames with the
// chroma components subsampled 2:1 horizontally, and it uses the fixed
// Huffman tables defined in this file.  Errors detected while parsing are
// reported by throwing a string literal (const char *).
class MJPEGDecoder : public IMJPEGDecoder {
public:
    // w, h: dimensions of the output image in pixels.
    MJPEGDecoder(int w, int h);
    ~MJPEGDecoder();

    // Decode one compressed frame of len bytes at input into output.
    // decodeFrame16 selects the 16-bit output path, decodeFrame32 the
    // 32-bit one; both forward to decodeFrame().
    void decodeFrame16(dword *output, byte *input, int len);
    void decodeFrame32(dword *output, byte *input, int len);

private:
    // Each table holds 64 pairs: [2n] is the quantizer value and [2n+1] an
    // offset derived from the zigzag position (see decodeQuantTables()).
    int quant[4][128];                // quantization matrices
    int width, height, field_height;  // output size; field_height is height/2 when interlaced
    int mcu_width, mcu_height;        // size of frame when blocked into MCUs
    int mcu_length;                   // number of 8x8 blocks in one MCU
    int mcu_count;                    // mcu_width * mcu_height, computed before any clipping of mcu_height
    int mcu_size_y;                   // MCU height in pixels (luma vertical factor * 8)
    int raw_width, raw_height;        // dimensions read from the SOF0 header
    int clip_row, clip_lines;         // set by decodeFrameInfo() only when the coded image is taller than field_height
    void *pixdst;                     // output buffer for the scan being decoded

    // Per-component data from the SOF0 header, indexed in frame header order.
    int *comp_quant[3];               // quantization matrix used by the component
    int comp_mcu_x[3], comp_mcu_y[3], comp_mcu_length[4];   // sampling factors and blocks per MCU
    int comp_last_dc[3];              // running DC values, reset at the start of each scan
    int comp_id[3];                   // component identifiers
    int comp_start[3];                // index of the component's first block within an MCU

    // Working storage for a batch of MCUs; decodeScan() replicates the block
    // definitions of one MCU four times, so 24 entries cover four MCUs of up
    // to six blocks each.
    MJPEGBlockDef blocks[24];
    short dct_coeff[24][64];
    short *dct_coeff_ptrs[24];

    bool vc_half;                    // chrominance 2:1 vertically?
    bool interlaced;                 // frame consists of two fields (from the APP0 segment)
    bool decode16;                   // true for the 16-bit output path, false for 32-bit

    void decodeFrame(dword *output, byte *input, int len);
    byte *decodeQuantTables(byte *psrc);
    byte *decodeFrameInfo(byte *psrc);
    byte *decodeScan(byte *ptr, bool odd_field);
    // NOTE(review): no definition of the two huffDecode helpers is visible
    // in this part of the file.
    byte __forceinline huffDecodeDC(dword& bitbuf, int& bitcnt, const byte * const table);
    byte __forceinline huffDecodeAC(dword& bitbuf, int& bitcnt, const byte * const table);
    byte *decodeMCUs(byte *ptr, bool odd_field);
};
  301.  
// JPEG marker codes (the byte that follows 0xFF) recognised by
// decodeFrame().  APP_FIRST..APP_LAST is the range of application segments;
// application and comment segments are skipped using their length field,
// except APP0 (MARKER_APP_FIRST), from which the field information is read.
enum {
    MARKER_SOF0    = 0xc0,        // start-of-frame, baseline scan
    MARKER_SOI    = 0xd8,        // start of image
    MARKER_EOI    = 0xd9,        // end of image
    MARKER_SOS    = 0xda,        // start of scan
    MARKER_DQT    = 0xdb,        // define quantization tables
    MARKER_APP_FIRST    = 0xe0,
    MARKER_APP_LAST        = 0xef,
    MARKER_COMMENT        = 0xfe,
};
  312.  
  313. ///////////////////////////////////////////////////////////////////////////
  314. //
  315. //        Construction/destruction
  316. //
  317. ///////////////////////////////////////////////////////////////////////////
  318.  
  319. MJPEGDecoder::MJPEGDecoder(int w, int h) {
  320.     this->vc_half            = false;
  321.     this->width                = w;
  322.     this->height            = h;
  323.  
  324.     for(int tbl=0; tbl<2; tbl++) {
  325.         int base=0;
  326.         byte *ptr = (byte *)huff_ac_quick2[tbl];
  327.         const byte *countptr = huff_ac_src[tbl];
  328.         const byte *codeptr = huff_ac_src[tbl] + 16;
  329.  
  330.         for(int bits=1; bits<=12; bits++) {
  331.             for(int cnt=0; cnt<*countptr; cnt++) {
  332.                 int first, last;
  333.  
  334.                 first = base;
  335.                 last = base + (0x1000 >> bits);
  336.  
  337.                 if (first < 0xB00)
  338.                     first = 0xB00;
  339.  
  340.                 while(first < last) {
  341.                     *ptr++ = *codeptr;
  342.                     *ptr++ = bits;
  343.                     ++first;
  344.                 }
  345.  
  346.                 base = last;
  347.  
  348.                 ++codeptr;
  349.             }
  350.  
  351.             ++countptr;
  352.         }
  353.  
  354.         _RPT2(0,"Code length for table %d: %04x\n", tbl, base);
  355.     }
  356. }
  357.  
// Nothing to release: the decoder's buffers are all members of the object
// and the lookup tables are file-level statics.
MJPEGDecoder::~MJPEGDecoder() {
}
  360.  
  361. IMJPEGDecoder *CreateMJPEGDecoder(int w, int h) {
  362.     return new MJPEGDecoder(w, h);
  363. }
  364.  
  365. ///////////////////////////////////////////////////////////////////////////
  366.  
  367. ///////////////////////////////////////////////////////////////////////////
  368.  
  369.  
  370.  
  371. ///////////////////////////////////////////////////////////////////////////
  372.  
  373.  
  374. int __inline getshort(byte *p) {
  375.     return ((int)p[0]<<8) + (int)p[1];
  376. }
  377.  
  378.  
  379.  
  380. void MJPEGDecoder::decodeFrame16(dword *output, byte *ptr, int size) {
  381.     decode16 = true;
  382.     decodeFrame(output, ptr, size);
  383. }
  384.  
  385. void MJPEGDecoder::decodeFrame32(dword *output, byte *ptr, int size) {
  386.     decode16 = false;
  387.     decodeFrame(output, ptr, size);
  388. }
  389.  
  390. void MJPEGDecoder::decodeFrame(dword *output, byte *ptr, int size) {
  391.     byte *limit = ptr+size-1;
  392.     byte tag;
  393.     bool odd_field = true;
  394.     int field_count = 0;
  395.  
  396.     do {
  397. //        _RPT1(0,"Decoding %s field\n", odd_field ? "odd" : "even");
  398.  
  399.         // scan for SOI tag
  400.  
  401.         while(ptr < limit)
  402.             if (*ptr++ == 0xff)
  403.                 if ((tag = *ptr++) == MARKER_SOI)
  404.                     break;
  405.                 else if (tag == 0xff)
  406.                     while(ptr<limit && *ptr == 0xff)
  407.                         ++ptr;
  408.                 else {
  409. //                    _RPT0(0,"Error: markers found before SOI tag\n");
  410. //                    return;
  411.                     break;        // happens with dmb1
  412.                 }
  413.  
  414.         if (ptr >= limit) {
  415. //            _RPT0(0,"Error: SOI mark not found\n");
  416.             return;
  417.         }
  418.  
  419.         // parse out chunks
  420.  
  421.         while(ptr < limit) {
  422.             if (*ptr++ == 0xff)
  423.                 switch(tag = *ptr++) {
  424.                 case MARKER_EOI:
  425. //                    _RPT1(0,"Note: EOI tag found at %p\n", ptr-2);
  426.                     goto next_field;
  427.                 case MARKER_DQT:
  428.                     ptr = decodeQuantTables(ptr);
  429.                     break;
  430.                 case MARKER_SOF0:
  431.                     ptr = decodeFrameInfo(ptr);
  432.  
  433.                     // dmb1 thinks it's interlaced all the time...
  434.  
  435.                     if (raw_height*2 > height)
  436.                         interlaced = false;
  437.  
  438.                     break;
  439.                 case MARKER_SOS:
  440.                     pixdst = output;
  441.                     ptr = decodeScan(ptr, odd_field);
  442. //                    _RPT1(0,"scan decode finished at %p\n", ptr);
  443.                     break;
  444.                 case MARKER_APP_FIRST:
  445.                     interlaced = (ptr[6] != 0);
  446.                     odd_field = (ptr[6] > 1);
  447.                     field_height = interlaced ? height/2 : height;
  448.                     ptr += getshort(ptr);
  449.                     break;
  450.                 case 0xff:
  451.                     while(ptr<limit && *ptr == 0xff)
  452.                         ++ptr;
  453.                     break;
  454.                 case 0:
  455.                     break;
  456.                 default:
  457.                     if ((tag >= MARKER_APP_FIRST && tag <= MARKER_APP_LAST) || tag == MARKER_COMMENT) {
  458.  
  459.                         ptr += getshort(ptr);
  460.                         break;
  461.                     }
  462. //                    _RPT1(0,"Warning: Unknown tag %02x\n", tag);
  463.                 }
  464.         }
  465. next_field:
  466.         ;
  467.     } while(interlaced && field_count<2);
  468.  
  469. //    _RPT0(0,"Warning: No EOI tag found\n");
  470. }
  471.  
  472. byte *MJPEGDecoder::decodeQuantTables(byte *psrc) {
  473.     int *dst;
  474.     int n;
  475.  
  476.     psrc += 2;    // skip length
  477.     while(*psrc != 0xff) {
  478.         n = psrc[0] & 15;
  479.         if (n>3)
  480.             throw "Error: Illegal quantization table # in DQT chunk";
  481.  
  482.         dst = quant[n];
  483.         ++psrc;
  484.  
  485.         // We have to swap around the zigzag order so that the order
  486.         // of rows is: 0, 4, 1, 5, 2, 6, 3, 7.
  487.  
  488.         if (psrc[-1] & 0xf0) {
  489.             // 16-bit quantization tables
  490.  
  491.             for(n=0; n<64; n++) {
  492.                 dst[n*2+0] = getshort(psrc + n*2);
  493.                 dst[n*2+1] = ((MJPEG_zigzag[n] & 56) | ((MJPEG_zigzag[n]&3)<<1) | ((MJPEG_zigzag[n]&4)>>2))*2;
  494.             }
  495.             psrc += 128;
  496.         } else {
  497.             // 8-bit quantization tables
  498.  
  499.             for(n=0; n<64; n++) {
  500.                 dst[n*2+0] = psrc[n];
  501.                 dst[n*2+1] = ((MJPEG_zigzag[n] & 56) | ((MJPEG_zigzag[n]&3)<<1) | ((MJPEG_zigzag[n]&4)>>2))*2;
  502.             }
  503.             psrc += 64;
  504.         }
  505.  
  506. //        MJPEG_IDCT_norm(dst);
  507.     }
  508.  
  509.     return psrc;
  510. }
  511.  
  512. byte *MJPEGDecoder::decodeFrameInfo(byte *psrc) {
  513.     int i, n;
  514.  
  515.     if (psrc[2] != 8)
  516.         throw "Can only decode 8-bit images";
  517.  
  518.     raw_height = getshort(psrc + 3);
  519.     raw_width = getshort(psrc + 5);
  520.  
  521.     if (psrc[7] != 3)
  522.         throw "Error: picture must be 3 component (YCC)";
  523.  
  524.     // parse component data
  525.  
  526. //    if (psrc[8] != 0)
  527. //        throw "Error: first component must be 0";
  528.  
  529. //    if (psrc[11] != 1)
  530. //        throw "Error: second component must be 1";
  531.  
  532. //    if (psrc[14] != 2)
  533. //        throw "Error: third component must be 2";
  534.  
  535.     if (psrc[12] != psrc[15])
  536.         throw "Error: chrominance subsampling factors must be the same";
  537.  
  538.     mcu_length = 0;
  539.     for(i=0; i<3; i++) {
  540.         n = psrc[10 + 3*i];
  541.         if (n>3)
  542.             throw "Error: component specifies quantization table other than 0-3";
  543.  
  544. //        _RPT2(0,"Component %d uses quant %d\n", i, n);
  545.  
  546.         comp_quant[i] = quant[n];
  547.         comp_mcu_x[i] = psrc[9 + 3*i] >> 4;
  548.         comp_mcu_y[i] = psrc[9 + 3*i] & 15;
  549.         comp_mcu_length[i] = comp_mcu_x[i] * comp_mcu_y[i];
  550.         comp_id[i] = psrc[8 + 3*i];
  551.         comp_start[i] = mcu_length;
  552.  
  553.         mcu_length += comp_mcu_length[i];
  554.     }
  555.  
  556.     if (mcu_length > 10)
  557.         throw "Error: macroblocks per MCU > 10";
  558.  
  559.     if (comp_mcu_x[0] != 2 || comp_mcu_x[1] != 1)
  560.         throw "Error: horizontal chrominance subsampling must be 2:1";
  561.  
  562.     if ((comp_mcu_x[0] != 2 && comp_mcu_x[0] != 1) || comp_mcu_y[1] != 1)
  563.         throw "Error: vertical chrominance subsampling must be 1:1 or 2:1";
  564.  
  565.     if (comp_mcu_y[0] == 2)
  566.         vc_half = true;
  567.  
  568.     mcu_width    = (raw_width + 15)/16;
  569.  
  570.     if (vc_half)
  571.         mcu_height    = (raw_height + 15)/16;
  572.     else
  573.         mcu_height    = (raw_height + 7)/8;
  574.  
  575.     mcu_count = mcu_width * mcu_height;
  576.     mcu_size_y = comp_mcu_y[0] * 8;
  577.  
  578.     if (vc_half) {
  579.         if (mcu_height*16 > field_height) {
  580.             mcu_height    = (field_height + 15)/16;
  581.             clip_row = field_height >> 4;
  582.             clip_lines = (field_height>>1) & 7;
  583.         }
  584.     } else {
  585.         if (mcu_height*8 > field_height) {
  586.             mcu_height    = (field_height + 7)/8;
  587.             clip_row = field_height >> 3;
  588.             clip_lines = field_height & 7;
  589.         }
  590.     }
  591.  
  592.     return psrc + 8 + 3*3;
  593. }
  594.  
  595. byte *MJPEGDecoder::decodeScan(byte *ptr, bool odd_field) {
  596.     int mb=0;
  597.     int i,j;
  598.  
  599.     // Ns (components in scan) must be 3
  600.  
  601.     if (ptr[2] != 3)
  602.         throw "Error: scan must have 3 interleaved components";
  603.  
  604.     if (ptr[9] != 0 || ptr[10] != 63)
  605.         throw "Error: DCT coefficients must run from 0-63";
  606.  
  607.     if (ptr[11] != 0)
  608.         throw "Error: Successive approximation not allowed";
  609.  
  610.     // decode component order (indices 3, 5, 7)
  611.  
  612. //    if (ptr[3] != 0)
  613. //        throw "Error: component 0 must be Y";
  614.  
  615. //    if (ptr[5] != 1)
  616. //        throw "Error: component 0 must be Cr";
  617.  
  618. //    if (ptr[7] != 2)
  619. //        throw "Error: component 0 must be Cb";
  620.  
  621.  
  622.     // select entropy (Huffman) coders (indices 4, 6, 8)
  623.  
  624.     for(i=0; i<3; i++) {
  625.         for(j=0; j<3; j++)
  626.             if (ptr[3+2*i] == comp_id[j])
  627.                 break;
  628.  
  629.         if (j>=3)
  630.             throw "Error: MJPEG scan has mislabeled component";
  631.  
  632.         mb = comp_start[j];
  633.  
  634.         for(j=0; j<comp_mcu_x[i]*comp_mcu_y[i]; j++) {
  635.             blocks[mb].huff_dc    = huff_dc[ptr[4+2*i]>>4];
  636.             blocks[mb].huff_ac    = huff_ac[ptr[4+2*i]&15];
  637.             blocks[mb].huff_ac_quick = huff_ac_quick[ptr[4+2*i]&15];
  638.             blocks[mb].huff_ac_quick2 = huff_ac_quick2[ptr[4+2*i]&15];
  639.             blocks[mb].quant    = comp_quant[i];
  640.             blocks[mb].dc_ptr    = &comp_last_dc[i];
  641.             ++mb;
  642.         }
  643.  
  644. //        comp_last_dc[i] = 128*8;
  645.     }
  646.  
  647.     for(i=0; i<mcu_length; i++) {
  648.         blocks[i+mcu_length] = blocks[i];
  649.         blocks[i+mcu_length*2] = blocks[i];
  650.         blocks[i+mcu_length*3] = blocks[i];
  651.         dct_coeff_ptrs[i] = &dct_coeff[i][0];
  652.         dct_coeff_ptrs[i+mcu_length] = &dct_coeff[i+mcu_length][0];
  653.         dct_coeff_ptrs[i+mcu_length*2] = &dct_coeff[i+mcu_length*2][0];
  654.         dct_coeff_ptrs[i+mcu_length*3] = &dct_coeff[i+mcu_length*3][0];
  655.     }
  656.  
  657.     comp_last_dc[0] = 128*8;
  658.     comp_last_dc[1] = 0;
  659.     comp_last_dc[2] = 0;
  660.  
  661.     ptr += 12;
  662.  
  663.     return decodeMCUs(ptr, odd_field);
  664. }
  665.  
  666. // 320x240 -> 20x30 -> 600 MCUs
  667. // 304x228 -> 19x29 -> 551 MCUs 
  668.  
  669. byte *MJPEGDecoder::decodeMCUs(byte *ptr, bool odd_field) {
  670.     int mcu;
  671.     dword bitbuf = 0;
  672.     int bitcnt = 24;    // 24 - bits in buffer
  673.     dword *pixptr = (dword *)pixdst;
  674.     int mb_x = 0, mb_y = 0;
  675.     long modulo0;
  676.     long modulo1;
  677.     long modulo2;
  678.     long modulo3;
  679.     long lines = 8;
  680.     __int64 mb_cycles = 0;
  681.     __int64 dct_cycles = 0;
  682.     __int64 cvt_cycles = 0;
  683.  
  684.     pixptr += mcu_width*(decode16 ? 8 : 16) * (height - (vc_half?2:1));
  685.  
  686.     if (interlaced) {
  687.         if (vc_half)
  688.             pixptr -= mcu_width*(decode16 ? 8 : 16) *(odd_field?1:2);
  689.         else if (!odd_field)
  690.             pixptr -= mcu_width*(decode16 ? 8 : 16);
  691.     }
  692.  
  693.     if (vc_half) {
  694.         long bpr = mcu_width * 4 * (decode16 ? 8 : 16);
  695.  
  696.         if (interlaced) {
  697.             modulo0 = 4*bpr + (decode16 ? 32 : 64);
  698.             modulo1 = 2*bpr;
  699.         } else {
  700.             modulo0 = 2*bpr + (decode16 ? 32 : 64);
  701.             modulo1 = bpr;
  702.         }
  703.  
  704.         modulo2 = modulo3 = 4;
  705.     } else {
  706.         modulo0 = (decode16 ? 1 : 2) * (mcu_width*(interlaced ? 64 : 32) + 16);
  707.         modulo1 = (decode16 ? 1 : 2) * (mcu_width*(interlaced ? 512 : 256) + 16);
  708.         modulo2 = 128 - 8;
  709.         modulo3 = 0;
  710.     }
  711.  
  712.  
  713.     for(mcu=0; mcu<mcu_length*4; mcu++)
  714.         memset(dct_coeff[mcu], 0, 128);
  715.  
  716.     for(mcu=0; mcu<mcu_count; mcu+=4) {
  717. //    for(mcu=0; mcu<200; mcu++) {
  718.  
  719.         int mcus = 4;
  720.  
  721.         if (mcu >= mcu_count-4)
  722.             mcus = mcu_count - mcu;
  723.  
  724. #ifdef PROFILE
  725.         __asm {
  726.             rdtsc
  727.             sub dword ptr mb_cycles+0,eax
  728.             sbb dword ptr mb_cycles+4,edx
  729.         }
  730. #endif
  731.  
  732.         asm_mb_decode(bitbuf, bitcnt, ptr, mcu_length*mcus, blocks, dct_coeff_ptrs);
  733.  
  734. #ifdef PROFILE
  735.         __asm {
  736.             rdtsc
  737.             add dword ptr mb_cycles+0,eax
  738.             adc dword ptr mb_cycles+4,edx
  739.             sub dword ptr dct_cycles+0,eax
  740.             sbb dword ptr dct_cycles+4,edx
  741.         }
  742. #endif
  743.  
  744.         for(int i=0; i<mcu_length*mcus; i++)
  745.             IDCT_mmx(dct_coeff[i], dct_coeff[i], 16, 2, blocks[i].ac_last);
  746.  
  747. #ifdef PROFILE
  748.         __asm {
  749.             rdtsc
  750.             add dword ptr dct_cycles+0,eax
  751.             adc dword ptr dct_cycles+4,edx
  752.             sub dword ptr cvt_cycles+0,eax
  753.             sbb dword ptr cvt_cycles+4,edx
  754.         }
  755. #endif
  756.         for(i=0; i<mcus; i++) {
  757.             short *dct_coeffs = (short *)&dct_coeff[mcu_length * i];
  758.  
  759.             static const __int64 Cr_coeff = 0x0000005AFFD20000i64;
  760.             static const __int64 Cb_coeff = 0x00000000FFEA0071i64;
  761.  
  762.             static const __int64 C_bias = 0x0000008000000080i64;
  763.             static const __int64 C_bias2 = 0x0080008000800080i64;
  764.  
  765.             static const __int64 Cr_coeff_R = 0x005A005A005A005Ai64;
  766.             static const __int64 Cr_coeff_G = 0xFFD2FFD2FFD2FFD2i64;
  767.  
  768.             static const __int64 CrCb_coeff_G = 0xFFD2FFEAFFD2FFEAi64;
  769.  
  770.             static const __int64 Cb_coeff_B = 0x0071007100710071i64;
  771.             static const __int64 Cb_coeff_G = 0xFFEAFFEAFFEAFFEAi64;
  772.             static const __int64 mask5        = 0xF8F8F8F8F8F8F8F8i64;
  773.  
  774.             static const __int64 G_const_1    = 0x7C007C007C007C00i64;
  775.             static const __int64 G_const_2    = 0x7C007C007C007C00i64;
  776.             static const __int64 G_const_3    = 0x03e003e003e003e0i64;
  777.  
  778.             if (vc_half) {
  779.                 if (!decode16)
  780.                 __asm {
  781.                     push        ebp
  782.                     push        edi
  783.                     push        esi
  784.  
  785.                     mov            eax,dct_coeffs
  786.                     mov            edx,dword ptr pixptr
  787.  
  788.                     push        modulo3
  789.                     push        modulo2
  790.                     push        modulo1
  791.                     push        modulo0
  792.  
  793.                     mov            ebx,[esp + 4]
  794.  
  795.                     mov            ecx,eax
  796.                     add            eax,512
  797.  
  798.                     mov            ebp,2
  799.     zloop420:
  800.                     mov            edi,[esp + 4 + ebp*4]
  801.                     or            edi,edi
  802.                     jz            fastexit32
  803.     yloop420:
  804.                     mov            esi,4
  805.     xloop420:
  806.                     movq        mm0,[ecx]        ;Y (0A,1A,2A,3A)
  807.                     pxor        mm7,mm7
  808.                     movq        mm1,[ecx+16]    ;Y (0B,1B,2B,3B)
  809.                     movd        mm2,[eax+128]    ;Cr
  810.                     movd        mm3,[eax]        ;Cb
  811.                     movq        [ecx],mm7
  812.                     movq        [ecx+16],mm7
  813.                     movd        [eax],mm7
  814.                     movd        [eax+128],mm7
  815.                     psllw        mm0,6
  816.                     psllw        mm1,6
  817.                     punpcklwd    mm2,mm2
  818.                     punpcklwd    mm3,mm3
  819.                     movq        mm4,mm2
  820.                     movq        mm5,mm3
  821.                     pmullw        mm4,Cr_coeff_G
  822.                     pmullw        mm5,Cb_coeff_G
  823.                     pmullw        mm2,Cr_coeff_R
  824.                     pmullw        mm3,Cb_coeff_B
  825.                     paddw        mm4,mm5
  826.  
  827.                     movq        mm5,mm0
  828.                     movq        mm6,mm0
  829.                     paddw        mm0,mm2
  830.                     paddw        mm5,mm4
  831.                     paddw        mm6,mm3
  832.                     psraw        mm0,6
  833.                     psraw        mm5,6
  834.                     psraw        mm6,6
  835.                     packuswb    mm0,mm0
  836.                     packuswb    mm5,mm5
  837.                     packuswb    mm6,mm6
  838.                     punpcklbw    mm6,mm0
  839.                     punpcklbw    mm5,mm5
  840.                     movq        mm0,mm6
  841.                     punpcklbw    mm0,mm5
  842.                     punpckhbw    mm6,mm5
  843.                     movq        [edx+ebx],mm0
  844.                     movq        [edx+ebx+8],mm6
  845.  
  846.                     movq        mm5,mm1
  847.                     movq        mm7,mm1
  848.                     paddw        mm1,mm2
  849.                     paddw        mm5,mm4
  850.                     paddw        mm7,mm3
  851.                     psraw        mm1,6
  852.                     psraw        mm5,6
  853.                     psraw        mm7,6
  854.                     packuswb    mm1,mm1
  855.                     packuswb    mm5,mm5
  856.                     packuswb    mm7,mm7
  857.                     punpcklbw    mm7,mm1
  858.                     punpcklbw    mm5,mm5
  859.                     movq        mm1,mm7
  860.                     punpcklbw    mm1,mm5
  861.                     punpckhbw    mm7,mm5
  862.                     movq        [edx],mm1
  863.                     movq        [edx+8],mm7
  864.  
  865.  
  866.                     add            eax,4
  867.                     add            ecx,8
  868.                     add            edx,16
  869.  
  870.                     test        esi,1
  871.                     jz            noblockskip32
  872.  
  873.                     add            ecx,7*16
  874. noblockskip32:
  875.  
  876.                     dec            esi
  877.                     jne            xloop420
  878.  
  879.                     sub            ecx,14*16
  880.                     sub            edx,dword ptr [esp + 0]        /* 2*bpr + 64 */
  881.  
  882.                     dec            edi
  883.                     jne            yloop420
  884.  
  885.                     add            ecx,8*16
  886.  
  887.                     dec            ebp
  888.                     jne            zloop420
  889. fastexit32:
  890.                     add            esp,16
  891.  
  892.                     pop            esi
  893.                     pop            edi
  894.                     pop            ebp
  895.                 }
  896.                 else
  897.                 __asm {
  898.                     push        ebp
  899.                     push        edi
  900.                     push        esi
  901.  
  902.                     mov            eax,dct_coeffs
  903.                     mov            edx,dword ptr pixptr
  904.  
  905.                     push        modulo3
  906.                     push        modulo2
  907.                     push        modulo1
  908.                     push        modulo0
  909.  
  910.                     mov            ebx,[esp + 4]
  911.  
  912.                     mov            ecx,eax
  913.                     add            eax,512
  914.  
  915.                     mov            ebp,2
  916.     zloop2420:
  917.                     mov            edi,[esp + 4 + ebp*4]
  918.                     or            edi,edi
  919.                     jz            fastexit16
  920.     yloop2420:
  921.                     mov            esi,4
  922.     xloop2420:
  923.                     movq        mm0,[ecx]        ;Y (0A,1A,2A,3A)
  924.                     pxor        mm7,mm7
  925.                     movq        mm1,[ecx+16]    ;Y (0B,1B,2B,3B)
  926.                     movd        mm2,[eax+128]    ;Cr
  927.                     movd        mm3,[eax]        ;Cb
  928.                     movq        [ecx],mm7
  929.                     movq        [ecx+16],mm7
  930.                     movd        [eax],mm7
  931.                     movd        [eax+128],mm7
  932.                     psllw        mm0,6
  933.                     psllw        mm1,6
  934.                     punpcklwd    mm2,mm2
  935.                     punpcklwd    mm3,mm3
  936.                     movq        mm4,mm2
  937.                     movq        mm5,mm3
  938.                     pmullw        mm4,Cr_coeff_G
  939.                     pmullw        mm5,Cb_coeff_G
  940.                     pmullw        mm2,Cr_coeff_R
  941.                     pmullw        mm3,Cb_coeff_B
  942.                     paddw        mm4,mm5
  943.  
  944.                     pxor        mm7,mm7
  945.                     movq        mm5,mm0
  946.                     movq        mm6,mm0
  947.                     paddw        mm0,mm2
  948.                     paddw        mm5,mm4
  949.                     paddw        mm6,mm3
  950.                     psraw        mm0,6
  951.                     psraw        mm5,6
  952.                     psraw        mm6,6
  953.                     packuswb    mm0,mm0
  954.                     packuswb    mm6,mm6
  955.                     packuswb    mm5,mm5
  956.                     pand        mm0,mask5
  957.                     pand        mm6,mask5
  958.                     pand        mm5,mask5
  959.                     psrlq        mm0,1
  960.                     psrlq        mm6,3
  961.                     punpcklbw    mm6,mm0
  962.                     punpcklbw    mm5,mm7
  963.                     psllq        mm5,2
  964.                     por            mm6,mm5
  965.                     movq        [edx+ebx],mm6
  966.  
  967.                     pxor        mm0,mm0
  968.                     movq        mm5,mm1
  969.                     movq        mm7,mm1
  970.                     paddw        mm1,mm2
  971.                     paddw        mm5,mm4
  972.                     paddw        mm7,mm3
  973.                     psraw        mm1,6
  974.                     psraw        mm5,6
  975.                     psraw        mm7,6
  976.                     packuswb    mm1,mm1
  977.                     packuswb    mm5,mm5
  978.                     packuswb    mm7,mm7
  979.                     pand        mm1,mask5
  980.                     pand        mm7,mask5
  981.                     pand        mm5,mask5
  982.                     psrlq        mm1,1
  983.                     psrlq        mm7,3
  984.                     punpcklbw    mm7,mm1
  985.                     punpcklbw    mm5,mm0
  986.                     psllq        mm5,2
  987.                     por            mm7,mm5
  988.                     movq        [edx],mm7
  989.  
  990.  
  991.                     add            eax,4
  992.                     add            ecx,8
  993.                     add            edx,8
  994.  
  995.                     test        esi,1
  996.                     jz            noblockskip16
  997.  
  998.                     add            ecx,7*16
  999. noblockskip16:
  1000.  
  1001.                     dec            esi
  1002.                     jne            xloop2420
  1003.  
  1004.                     sub            ecx,14*16
  1005.                     sub            edx,dword ptr [esp + 0]        /* 2*bpr + 64 */
  1006.  
  1007.                     dec            edi
  1008.                     jne            yloop2420
  1009.  
  1010.                     add            ecx,8*16
  1011.  
  1012.                     dec            ebp
  1013.                     jne            zloop2420
  1014. fastexit16:
  1015.                     add            esp,16
  1016.  
  1017.                     pop            esi
  1018.                     pop            edi
  1019.                     pop            ebp
  1020.                 }
  1021.             } else {
  1022.                 if (!decode16)
  1023.                 __asm {
  1024.                     push        ebp
  1025.                     push        edi
  1026.                     push        esi
  1027.  
  1028.                     mov            eax,dct_coeffs
  1029.                     mov            edx,dword ptr pixptr
  1030.  
  1031.                     push        lines
  1032.                     push        modulo3
  1033.                     push        modulo2
  1034.                     push        modulo1
  1035.  
  1036.                     mov            ebx,modulo0
  1037.  
  1038.                     mov            ecx,eax
  1039.                     add            eax,256
  1040.  
  1041.                     mov            ebp,2
  1042.     zloop:
  1043.                     mov            edi,[esp + 12]
  1044.     yloop:
  1045.                     mov            esi,2
  1046.     xloop:
  1047.                     movd        mm4,[eax+128]    ;Cr (0,1)
  1048.                     pxor        mm1,mm1
  1049.  
  1050.                     movd        mm6,[eax]        ;Cb (0,1)
  1051.                     punpcklwd    mm4,mm4
  1052.  
  1053.                     movq        mm0,[ecx]        ;Y (0,1,2,3)
  1054.                     punpcklwd    mm6,mm6
  1055.  
  1056.                     movd        [eax],mm1
  1057.                     psllw        mm0,6
  1058.  
  1059.                     movd        [eax+128],mm1
  1060.                     movq        mm5,mm4
  1061.  
  1062.                     movq        [ecx],mm1    
  1063.                     movq        mm7,mm6
  1064.                     punpckldq    mm4,mm4        ;Cr (0,0,0,0)
  1065.  
  1066.                     pmullw        mm4,Cr_coeff
  1067.                     punpckhdq    mm5,mm5        ;Cr (1,1,1,1)
  1068.  
  1069.                     movq        mm2,mm0
  1070.                     punpcklwd    mm0,mm0            ;Y (0,0,1,1)
  1071.  
  1072.                     pmullw        mm5,Cr_coeff
  1073.                     punpckhwd    mm2,mm2            ;Y (2,2,3,3)
  1074.  
  1075.                     movq        mm1,mm0
  1076.                     punpckldq    mm0,mm0            ;Y (0,0,0,0)
  1077.  
  1078.                     movq        mm3,mm2
  1079.                     punpckhdq    mm1,mm1            ;Y (1,1,1,1)
  1080.  
  1081.                     punpckldq    mm6,mm6        ;Cb (0,0,0,0)
  1082.                     add            ecx,8
  1083.  
  1084.                     pmullw        mm6,Cb_coeff
  1085.                     punpckhdq    mm7,mm7        ;Cb (1,1,1,1)
  1086.  
  1087.                     pmullw        mm7,Cb_coeff
  1088.                     punpckhdq    mm3,mm3            ;Y (3,3,3,3)
  1089.  
  1090.                     punpckldq    mm2,mm2            ;Y (2,2,2,2)
  1091.                     add            eax,4
  1092.  
  1093.                     paddsw        mm4,mm6
  1094.                     paddsw        mm0,mm4
  1095.  
  1096.                     paddsw        mm5,mm7
  1097.                     psraw        mm0,6
  1098.  
  1099.                     paddsw        mm1,mm4
  1100.                     paddsw        mm2,mm5
  1101.  
  1102.                     psraw        mm1,6
  1103.                     paddsw        mm3,mm5
  1104.  
  1105.                     psraw        mm2,6
  1106.                     packuswb    mm0,mm1
  1107.  
  1108.                     psraw        mm3,6
  1109.                     packuswb    mm2,mm3
  1110.  
  1111.                     movq        [edx],mm0
  1112.                     add            edx,16
  1113.  
  1114.                     dec            esi
  1115.                     movq        [edx-8],mm2
  1116.                     jne            xloop
  1117.  
  1118.                     sub            edx,ebx                        /* 304*4 + 32 */
  1119.                     add            eax,8
  1120.  
  1121.                     dec            edi
  1122.                     jne            yloop
  1123.  
  1124.                     sub            eax,dword ptr [esp + 4]        /* 256-16 */
  1125.                     add            edx,dword ptr [esp + 0]        /* 304*4*8 + 32 */
  1126.                     add            ecx,dword ptr [esp + 8]
  1127.  
  1128.                     dec            ebp
  1129.                     jne            zloop
  1130.  
  1131.                     add            esp,16
  1132.  
  1133.                     pop            esi
  1134.                     pop            edi
  1135.                     pop            ebp
  1136.                 }
  1137.                 else
  1138.                 __asm {
  1139.                     push        ebp
  1140.                     push        edi
  1141.                     push        esi
  1142.  
  1143.                     mov            eax,dct_coeffs
  1144.                     mov            edx,dword ptr pixptr
  1145.  
  1146.                     push        lines
  1147.                     push        modulo3
  1148.                     push        modulo2
  1149.                     push        modulo1
  1150.                     mov            ebx,modulo0
  1151.  
  1152.                     mov            ecx,eax
  1153.                     add            eax,256
  1154.  
  1155.                     mov            ebp,2
  1156.     zloop2:
  1157.     //                pushad
  1158.     //                push        ecx
  1159.     //                call        MJPEG_IDCT
  1160.     //                pop            ecx
  1161.     //                popad
  1162.  
  1163.                     movq        mm6,mask5
  1164.                     pxor        mm7,mm7
  1165.     ;                movq        mm2,CrCb_coeff_G
  1166.     ;                movq        mm1,x4000400040004000
  1167.     ;                movq        mm7,Cr_coeff_R
  1168.                     mov            edi,[esp + 12]
  1169.     yloop2:
  1170.                     mov            esi,2
  1171.     xloop2:
  1172.                     movd        mm5,[eax]        ;Cb (0,1)
  1173.  
  1174.                     movd        mm3,[eax+128]    ;Cr (0,1)
  1175.                     movq        mm4,mm5            ;Cb [duplicate]
  1176.  
  1177.                     movq        mm0,[ecx]        ;Y (0,1,2,3)
  1178.                     punpcklwd    mm5,mm5            ;Cb [subsampling]
  1179.  
  1180.                     pmullw        mm5,Cb_coeff_B    ;Cb [produce blue impacts]
  1181.                     punpcklwd    mm4,mm3            ;mm4: [Cr1][Cb1][Cr0][Cb0]
  1182.  
  1183.                     pmaddwd        mm4,CrCb_coeff_G
  1184.                     punpcklwd    mm3,mm3            ;Cr [subsampling]
  1185.  
  1186.                     pmullw        mm3,Cr_coeff_R    ;Cr [produce red impacts]
  1187.                     psllw        mm0,6
  1188.  
  1189.                     paddw        mm5,mm0            ;B (0,1,2,3)
  1190.                     add            edx,8
  1191.  
  1192.                     packssdw    mm4,mm4            ;green impacts
  1193.  
  1194.                     paddw        mm3,mm0            ;R (0,1,2,3)
  1195.                     psraw        mm5,6
  1196.  
  1197.                     paddw        mm4,mm0            ;G (0,1,2,3)
  1198.                     psraw        mm3,6
  1199.  
  1200.                     movq        [ecx],mm7
  1201.                     packuswb    mm3,mm3
  1202.  
  1203.                     psraw        mm4,4
  1204.                     pand        mm3,mm6
  1205.  
  1206.                     paddsw        mm4,G_const_1
  1207.                     packuswb    mm5,mm5
  1208.  
  1209.                     pand        mm5,mm6
  1210.                     psrlq        mm3,1
  1211.  
  1212.                     psubusw        mm4,G_const_2
  1213.                     psrlq        mm5,3
  1214.  
  1215.                     pand        mm4,G_const_3
  1216.                     punpcklbw    mm5,mm3
  1217.  
  1218.                     por            mm5,mm4
  1219.                     add            eax,4
  1220.  
  1221.                     add            ecx,8
  1222.                     dec            esi
  1223.  
  1224.                     movq        [edx-8],mm5
  1225.                     jne            xloop2
  1226.  
  1227.                     sub            edx,ebx                        /* 304*4 + 32 */
  1228.                     add            eax,8
  1229.  
  1230.                     movq        [eax-16],mm7
  1231.                     movq        [eax+128-16],mm7
  1232.  
  1233.                     dec            edi
  1234.                     jne            yloop2
  1235.  
  1236.                     sub            eax,dword ptr [esp + 4]        /* 256-16 */
  1237.                     add            edx,dword ptr [esp + 0]        /* 304*4*8 + 32 */
  1238.                     add            ecx,dword ptr [esp + 8]
  1239.  
  1240.                     dec            ebp
  1241.                     jne            zloop2
  1242.  
  1243.                     add            esp,16
  1244.  
  1245.                     pop            esi
  1246.                     pop            edi
  1247.                     pop            ebp
  1248.                 }
  1249.             }
  1250.  
  1251.             if (lines < 8)
  1252.                 memset(dct_coeffs, 0, mcu_length * 64 * sizeof(short));
  1253.  
  1254.             pixptr += decode16 ? 8 : 16;
  1255.             if (++mb_x >= mcu_width) {
  1256.                 long bpr = mcu_width * (decode16 ? 8 : 16);
  1257.                 mb_x = 0;
  1258.  
  1259.                 if (vc_half) {
  1260.                     if (interlaced)
  1261.                         pixptr -= bpr * (4*lines+1);
  1262.                     else
  1263.                         pixptr -= bpr * (2*lines+1);
  1264.                 } else {
  1265.                     if (interlaced)
  1266.                         pixptr -= bpr * (2*lines+1);
  1267.                     else
  1268.                         pixptr -= bpr * (lines+1);
  1269.                 }
  1270.  
  1271.                 if (++mb_y == clip_row) {
  1272.                     int cl = clip_lines;
  1273.  
  1274.                     if (!vc_half) {
  1275.                         if (decode16)
  1276.                             modulo1 = mcu_width*16*2*cl + 16;
  1277.                         else
  1278.                             modulo1 = mcu_width*16*4*cl + 32;
  1279.                         modulo2 = 16*cl - 16;
  1280.                         modulo3 = 32*(8-cl);
  1281.                     } else {
  1282.                         modulo3 = cl;
  1283.                         modulo2 = cl-4;
  1284.  
  1285.                         if (modulo3>4)
  1286.                             modulo3=4;
  1287.  
  1288.                         if (modulo2<0)
  1289.                             modulo2=0;
  1290.                     }
  1291.                     lines = cl;
  1292.                 }
  1293.             }
  1294.         }
  1295. #ifdef PROFILE
  1296.         __asm {
  1297.             rdtsc
  1298.             add dword ptr cvt_cycles+0,eax
  1299.             adc dword ptr cvt_cycles+4,edx
  1300.         }
  1301. #endif
  1302.     }
  1303.  
  1304. #ifdef PROFILE
  1305.     {
  1306.         char buf[128];
  1307.         static __int64 tcycles;
  1308.         static __int64 tcyclesDCT;
  1309.         static __int64 tcyclesCVT;
  1310.         static int tcount;
  1311.  
  1312.         tcycles += mb_cycles;
  1313.         tcyclesDCT += dct_cycles;
  1314.         tcyclesCVT += cvt_cycles;
  1315.         tcount += mcu_count;
  1316.  
  1317.         if (tcount >= 65536) {
  1318.             sprintf(buf, "decode: %4I64d (%3d%% CPU)     IDCT: %4I64d (%3d%% CPU)    CVT: %4I64d (%3d%% CPU)\n",
  1319.                         tcycles/tcount,
  1320.                         (long)((tcycles*24*2997+tcount*2250000i64)/(tcount*4500000i64)),
  1321.                         tcyclesDCT/tcount,
  1322.                         (long)((tcyclesDCT*24*2997+tcount*2250000i64)/(tcount*4500000i64)),
  1323.                         tcyclesCVT/tcount,
  1324.                         (long)((tcyclesCVT*24*2997+tcount*2250000i64)/(tcount*4500000i64))
  1325.                         );
  1326.             OutputDebugString(buf);
  1327.             tcount = 0;
  1328.             tcycles = 0;
  1329.             tcyclesDCT = 0;
  1330.             tcyclesCVT = 0;
  1331.  
  1332.  
  1333. #ifdef DCTLEN_PROFILE
  1334.             sprintf(buf, "%d short coefficients, %d medium, %d long\n", short_coeffs, med_coeffs, long_coeffs);
  1335.             OutputDebugString(buf);
  1336.             short_coeffs = med_coeffs = long_coeffs = 0;
  1337. #endif
  1338.         }
  1339.     }
  1340. #endif
  1341.  
  1342.     __asm emms
  1343.  
  1344. //    return ptr - ((31-bitcnt)>>3);
  1345.     return ptr - 8;
  1346. }
  1347.